/*
 * friso lexicon function implementations.
 * 		used to work with the friso lexicon, e.g. load, remove, match...
 * 
 * @author	chenxin <chenxin619315@gmail.com>
 */
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso.h"

#define __SPLIT_MAX_TOKENS__ 	5
#define __LEX_FILE_DELIME__ 	'#'
#define __FRISO_LEX_IFILE__	"friso.lex.ini"

/**
 * create a new dictionary: 
 * 	an array of __FRISO_LEXICON_LENGTH__ hash tables, 
 * 	every slot gets its own table from new_hash_table().
 *
 * @return	the new dictionary
 */
FRISO_API friso_dic_t friso_dic_new() 
{
	register uint_t idx = 0;
	friso_dic_t tables = ( friso_dic_t ) FRISO_CALLOC( 
			sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );

	if ( tables == NULL ) {
		___ALLOCATION_ERROR___
	}

	//one hash table for each slot of the array.
	while ( idx < __FRISO_LEXICON_LENGTH__ ) {
		tables[idx] = new_hash_table();
		idx++;
	}

	return tables;
}

/**
 * callback handed to free_hash_table by friso_dic_free, 
 * 	invoked with a hash entry of the dictionary.
 *
 * the value of the entry is taken as a lex_entry_t and released:
 * 	its word, every item of its syn list (when syn is not NULL),
 * 	the syn list itself and finally the lex_entry_t.
 *
 * @param e	the hash entry whose value will be freed
 */
__STATIC_API__ void default_fdic_callback( hash_entry_t e ) 
{
	lex_entry_t entry = ( lex_entry_t ) e->_val;
	friso_array_t synonyms = entry->syn;
	register uint_t k;

	//the word string of the entry.
	FRISO_FREE( entry->word );

	//the synonyms: each item first, then the list.
	if ( synonyms != NULL ) {
		for ( k = 0; k < synonyms->length; k++ ) {
			FRISO_FREE( synonyms->items[k] );
		}
		free_array_list( synonyms );
	}

	//the entry itself (e->_val).
	FRISO_FREE( entry );
}

/**
 * free the whole dictionary.
 * 	every hash table is released with default_fdic_callback
 * 	as its entry callback, then the table array itself.
 *
 * @param dic	the dictionary to free
 */
FRISO_API void friso_dic_free( friso_dic_t dic ) 
{
	register uint_t idx = 0;

	while ( idx < __FRISO_LEXICON_LENGTH__ ) {
		free_hash_table( dic[idx], default_fdic_callback );
		idx++;
	}

	FRISO_FREE( dic );
}


/**
 * create a new lexicon entry.
 * 	the word and syn pointers are stored as given (not copied).
 *
 * @param word		the word of the entry
 * @param syn		synonyms array list
 * @param fre		word frequency
 * @param length	word length, stored (as uchar_t) in both length and rlen
 * @param type		lexicon type, stored as uchar_t
 * @return		the new entry
 */
FRISO_API lex_entry_t new_lex_entry( 
		fstring word, 
		friso_array_t syn, 
		uint_t fre, 
		uint_t length, 
		uint_t type ) 
{
	lex_entry_t entry = ( lex_entry_t ) 
		FRISO_MALLOC( sizeof( lex_entry_cdt ) );

	if ( entry == NULL ) {
		___ALLOCATION_ERROR___
	} 

	//values given by the caller.
	entry->word	= word;
	entry->syn	= syn;
	entry->fre	= fre;
	entry->type	= (uchar_t) type;
	entry->length	= (uchar_t) length;
	entry->rlen	= (uchar_t) length;	//same as length by default.

	//defaults.
	entry->pos	= NULL;			//no part of speech list yet.
	entry->ctrlMask	= 0;
	entry->offset	= -1;

	return entry;
}

/**
 * free the given lexicon entry.
 *
 * only the lex_entry_t itself is released here; 
 * 	its word, its syn list (and the items of it) and its pos list 
 * 	are not freed by this function.
 *
 * @param e	the entry to free
 */
FRISO_API void free_lex_entry( lex_entry_t e ) 
{
	FRISO_FREE( e );
}


/**
 * add a new entry (with frequency 0) to the dictionary.
 *
 * the word pointer is stored in the entry and used as the hash key,
 * 	no copy is made here. nothing is added when lex is not 
 * 	a valid lexicon index.
 *
 * @param dic	the dictionary (hash table array indexed by lexicon type)
 * @param lex	the lexicon type, valid in [0, __FRISO_LEXICON_LENGTH__)
 * @param word	the word to add
 * @param syn	synonyms array list or NULL
 */
FRISO_API void friso_dic_add( 
		friso_dic_t dic, 
		friso_lex_t lex,
		fstring word, 
		friso_array_t syn ) 
{
	//both bounds have to hold: with '||' the check was always true
	//	and an out-of-range lex indexed past the end of dic.
	if ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) 
	{
		hash_put_mapping( dic[lex], word, 
				new_lex_entry( word, syn, 0, 
					(uint_t) strlen(word),  (uint_t) lex ) );
	}
}

/**
 * add a new entry with a word frequency to the dictionary.
 * 	nothing is added when lex is not a valid lexicon index.
 *
 * @param dic		the dictionary
 * @param lex		the lexicon type
 * @param word		the word to add (stored as given, also the hash key)
 * @param syn		synonyms array list
 * @param frequency	the word frequency
 */
FRISO_API void friso_dic_add_with_fre( 
		friso_dic_t dic,
		friso_lex_t lex,
		fstring word, 
		friso_array_t syn, 
		uint_t frequency ) 
{
	lex_entry_t entry;

	//ignore the invalid lexicon type.
	if ( ! ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) ) return;

	entry = new_lex_entry( word, syn, frequency, 
			( uint_t ) strlen(word), ( uint_t ) lex );
	hash_put_mapping( dic[lex], word, entry );
}

/*
 * read a line from the specified stream into __dst.
 * 		reading stops at the first '\n' or at EOF, 
 * 		the '\n' itself is not stored.
 *
 * NOTE: no size of __dst is known here, so the caller has to 
 * 		offer a buffer large enough for the longest line.
 *
 * @param	__dst	the buffer that receives the line
 * @param	_stream	the stream to read from
 * @return	__dst, or NULL when EOF is hit before any byte was stored
 */
FRISO_API fstring file_get_line( fstring __dst, FILE * _stream ) 
{
	fstring out = __dst;
	register int ch;

	for ( ; ; ) {
		ch = fgetc( _stream );
		if ( ch == EOF || ch == '\n' ) break;
		*out++ = ch;
	}
	*out = '\0';

	if ( ch == EOF && out == __dst ) {
		return NULL;
	}

	return __dst;
}

/*
 * static function to copy a string (used instead of memcpy).
 * 	at most blocks bytes are copied, the copy stops earlier 
 * 	at the end of _src. __dst is always terminated with '\0', 
 * 	so it needs room for blocks + 1 bytes.
 *
 * @param _src		the source fstring
 * @param __dst		the destination buffer
 * @param blocks	the max number of bytes to copy
 * @return		__dst
 */
__STATIC_API__ fstring string_copy( 
		fstring _src, 
		fstring __dst, 
		uint_t blocks ) 
{
	register uint_t n = 0;

	while ( n < blocks && _src[n] != '\0' ) {
		__dst[n] = _src[n];
		n++;
	}
	__dst[n] = '\0';

	return __dst;
}

/**
 * copy the source fstring to a new heap allocation 
 * 	of blocks + 1 bytes, the caller should free it after use . 
 * at most blocks bytes are copied (less when _src is shorter)
 * 	and the copy is always terminated with '\0' . 
 *
 * @param _src		source fstring
 * @param blocks	number of bytes to copy
 * @return		the new allocated copy
 */
__STATIC_API__ fstring string_copy_heap( 
		fstring _src, uint_t blocks ) 
{
	register uint_t n = 0;
	fstring copy = ( fstring ) FRISO_MALLOC( blocks + 1 );

	if ( copy == NULL ) {
		___ALLOCATION_ERROR___;
	}

	while ( n < blocks && _src[n] != '\0' ) {
		copy[n] = _src[n];
		n++;
	}
	copy[n] = '\0';

	return copy;
}

/*
 * find the position of the first appearance of the given char.
 *	the address of the char inside the fstring is returned,
 *	or NULL when it is not found . 
 *	the terminating '\0' is never matched.
 */
__STATIC_API__ fstring indexOf( fstring __str, char delimiter ) 
{
	fstring cursor;

	for ( cursor = __str; *cursor != '\0'; cursor++ ) {
		if ( *cursor == delimiter ) {
			return cursor;
		}
	}

	return NULL;
}

/**
 * load all the valid words from a specified lexicon file 
 * 	and add them to friso->dic . 
 *
 * one item per line. a line that starts with '#' and is longer 
 * 	than one char is a comment and skipped.
 *
 * 1. for __LEX_STOPWORDS__ the whole line is the word.
 * 2. for the other types the line is split with '/':
 * 		word[/synonyms[/frequency]]
 * 	synonyms is a ',' separated list or "null" for none.
 * 	an item with the word field only is added directly, 
 * 	without the length check.
 *
 * a warning is printed when the file could not be opened.
 *
 * NOTE(review): the line buffer is 1024 bytes and the token buffers 
 * 	are 512 bytes, longer lines/tokens are not guarded here.
 *
 * @param friso		friso instance, its dic receives the words
 * @param config	friso configuration, add_syn is checked here
 * @param lex		the lexicon type
 * @param lex_file	the path of the lexicon file
 * @param length	the maximum length (bytes) of the word item
 */
FRISO_API void friso_dic_load( 
		friso_t friso,
		friso_config_t config,
		friso_lex_t lex,
		fstring lex_file,
		uint_t length ) 
{

	FILE * _stream;
	char __char[1024], _buffer[512];
	fstring _line;
	string_split_entry sse;

	fstring _word;
	char _sbuffer[512];
	fstring _syn;
	friso_array_t sywords;
	uint_t _fre;

	if ( ( _stream = fopen( lex_file, "rb" ) ) == NULL ) 
	{
		printf("Warning: Fail to open lexicon file %s\n", lex_file);
		return;
	}

	while ( ( _line = file_get_line( __char, _stream ) ) != NULL ) 
	{
		//clear up the notes
		//make sure the length of the line is greater than 1.
		//like the single '#' mark in stopwords dictionary.
		if ( _line[0] == '#' && strlen(_line) > 1 ) continue;

		//handle the stopwords.
		if ( lex == __LEX_STOPWORDS__ )
		{
			//clean the chinese words that its length is greater than max length.
			//a non-ASCII word starts with a byte that has the high bit set.
			//	checked through unsigned char, as the sign of a plain 
			//	char (and so "(int) c < 0") depends on the platform.
			if ( ( (unsigned char) _line[0] ) >= 0x80 
					&& strlen( _line ) > length ) continue;
			friso_dic_add( friso->dic, __LEX_STOPWORDS__, 
					string_copy_heap( _line, strlen(_line) ), NULL ); 
			continue;
		}

		//split the fstring with '/'.
		string_split_reset( &sse, "/", _line); 
		if ( string_split_next( &sse, _buffer ) == NULL ) continue;

		//1. get the word.
		_word = string_copy_heap( _buffer, strlen(_buffer) );

		if ( string_split_next( &sse, _buffer ) == NULL ) 
		{
			//normal lexicon type, 
			//add them to the dictionary directly
			friso_dic_add( friso->dic, lex, _word, NULL ); 
			continue;
		}

		/*
		 * filter out the words that its length is larger
		 * 	than the specified limit.
		 * but not for __LEX_ECM_WORDS__ and __LEX_CEM_WORDS__.
		 */
		if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
				&& strlen( _word ) > length ) 
		{
			FRISO_FREE(_word);
			continue;
		}

		//2. get the synonyms words.
		//	copied to _sbuffer as _buffer is reused for the next tokens.
		_syn = NULL;
		if ( strcmp( _buffer, "null" ) != 0 )
			_syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );

		//3. get the word frequency if it available.
		_fre = 0;
		if ( string_split_next( &sse, _buffer ) != NULL )
			_fre = atoi( _buffer );

		/**
		 * Here:
		 * split the synonyms words with mark "," 
		 * 	and put them in a array list if the synonyms is not NULL.
		 * the synonyms longer than the limit are dropped.
		 */
		sywords = NULL;
		if ( config->add_syn && _syn != NULL ) 
		{
			string_split_reset( &sse, ",", _sbuffer );
			sywords = new_array_list_with_opacity(5);
			while ( string_split_next( &sse, _buffer ) != NULL ) 
			{
				if ( strlen(_buffer) > length ) continue;
				array_list_add( sywords, 
						string_copy_heap(_buffer, strlen(_buffer)) );
			}
			sywords = array_list_trim( sywords );
		}

		//4. add the word item
		friso_dic_add_with_fre( 
				friso->dic, lex, _word, sywords, _fre );
	} 

	fclose( _stream );
}


/**
 * get the lexicon type index with the specified 
 * 	type keywords, the key has to match the name 
 * 	of the constant exactly . 
 *
 * @see		friso.h#friso_lex_t
 * @param 	_key	name of the lexicon type constant
 * @return 	the lexicon type or -1 for an unknown key
 */
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key ) 
{
	//name => lexicon type mapping.
	static const struct {
		const char *name;
		friso_lex_t type;
	} lex_names[] = {
		{ "__LEX_CJK_WORDS__",		__LEX_CJK_WORDS__ },
		{ "__LEX_CJK_UNITS__",		__LEX_CJK_UNITS__ },
		{ "__LEX_ECM_WORDS__",		__LEX_ECM_WORDS__ },
		{ "__LEX_CEM_WORDS__",		__LEX_CEM_WORDS__ },
		{ "__LEX_CN_LNAME__",		__LEX_CN_LNAME__ },
		{ "__LEX_CN_SNAME__",		__LEX_CN_SNAME__ },
		{ "__LEX_CN_DNAME1__",		__LEX_CN_DNAME1__ },
		{ "__LEX_CN_DNAME2__",		__LEX_CN_DNAME2__ },
		{ "__LEX_CN_LNA__",		__LEX_CN_LNA__ },
		{ "__LEX_STOPWORDS__",		__LEX_STOPWORDS__ },
		{ "__LEX_ENPUN_WORDS__",	__LEX_ENPUN_WORDS__ },
		{ "__LEX_EN_WORDS__",		__LEX_EN_WORDS__ }
	};
	register uint_t i;

	for ( i = 0; i < sizeof( lex_names ) / sizeof( lex_names[0] ); i++ ) {
		if ( strcmp( _key, lex_names[i].name ) == 0 ) {
			return lex_names[i].type;
		}
	}

	return -1;
}

/*
 * load the lexicon configuration file.
 *		and load all the valid lexicon from the configuration file.
 *
 * @param friso		friso instance
 * @param _path		dictionary directory
 * @param _limitts	words length limit	
 */
FRISO_API void friso_dic_load_from_ifile( 
		friso_t friso, 
		friso_config_t config,
		fstring _path,
		uint_t _limits  ) 
{

	//1.parse the configuration file.
	FILE * __stream;
	char __chars__[1024], __key__[30], *__line__;
	uint_t __length__, i, t;
	friso_lex_t lex_t;
	string_buffer_t sb;

	//get the lexicon configruation file path
	sb = new_string_buffer();
	string_buffer_append( sb, _path );
	if ( _path[ strlen(_path) - 1 ] != '/' ) 
		string_buffer_append( sb, "/" );
	string_buffer_append( sb, __FRISO_LEX_IFILE__ );

	if ( ( __stream = fopen( sb->buffer, "rb" ) ) != NULL ) 
	{
		while ( ( __line__ = 
					file_get_line( __chars__, __stream ) ) != NULL ) 
		{
			//comment filter.
			if ( __line__[0] == '#' ) continue;
			if ( __line__[0] == '\0' ) continue;

			__length__ = strlen( __line__ );
			//item start
			if ( __line__[ __length__ - 1 ] == '[' ) 
			{
				//get the type key
				for ( i = 0; i < __length__
						&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ ); 
				for ( t = 0; i < __length__; i++,t++ ) {
					if ( __line__[i] == ' ' 
							|| __line__[i] == '\t' || __line__[i] == ':' ) break;
					__key__[t] = __line__[i];
				}
				__key__[t] = '\0';

				//get the lexicon type
				lex_t = get_lexicon_type_with_constant(__key__);
				if ( lex_t == -1 ) continue; 

				//printf("key=%s, type=%d\n", __key__, lex_t );
				while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) 
				{
					//comments filter.
					if ( __line__[0] == '#' ) continue;
					if ( __line__[0] == '\0' ) continue; 

					__length__ = strlen( __line__ );
					if ( __line__[ __length__ - 1 ] == ']' ) break;

					for ( i = 0; i < __length__ 
							&& ( __line__[i] == ' ' || __line__[i] == '\t' ); i++ );
					for ( t = 0; i < __length__; i++,t++ ) {
						if ( __line__[i] == ' ' 
								|| __line__[i] == '\t' || __line__[i] == ';' ) break;
						__key__[t] = __line__[i]; 
					}
					__key__[t] = '\0';

					//load the lexicon item from the lexicon file.
					string_buffer_clear( sb );
					string_buffer_append( sb, _path );
					string_buffer_append( sb, __key__ );
					//printf("key=%s, type=%d\n", __key__, lex_t);
					friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
				}

			} 

		} //end while

		fclose( __stream );
	} else {
		printf("Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
	}

	free_string_buffer(sb);	
}

/**
 * check if the word exists in the dictionary of the specified type.
 *
 * @param dic	the dictionary
 * @param lex	the lexicon type
 * @param word	the word to match
 * @return	result of hash_exist_mapping, 0 for an invalid lexicon type
 */
FRISO_API int friso_dic_match( 
		friso_dic_t dic, 
		friso_lex_t lex, 
		fstring word ) 
{
	//invalid lexicon type, nothing to match.
	if ( ! ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) ) {
		return 0;
	}

	return hash_exist_mapping( dic[lex], word );
}

/**
 * get the lex_entry_t associated with the word.
 *
 * @param dic	the dictionary
 * @param lex	the lexicon type
 * @param word	the word to look for
 * @return	value of hash_get_value as lex_entry_t, 
 * 		NULL for an invalid lexicon type
 */
FRISO_API lex_entry_t friso_dic_get( 
		friso_dic_t dic, 
		friso_lex_t lex, 
		fstring word ) 
{
	if ( ! ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) ) {
		return NULL;
	}

	return ( lex_entry_t ) hash_get_value( dic[lex], word );
}

/**
 * get the size of the specified type dictionary.
 *
 * @param dic	the dictionary
 * @param lex	the lexicon type
 * @return	size of the hash table of the type, 
 * 		0 for an invalid lexicon type
 */
FRISO_API uint_t friso_spec_dic_size( 
		friso_dic_t dic, 
		friso_lex_t lex ) 
{
	if ( ! ( lex >= 0 && lex < __FRISO_LEXICON_LENGTH__ ) ) {
		return 0;
	}

	return hash_get_size( dic[lex] );
}

/**
 * get the size of the whole dictionary: 
 * 	the sum of the sizes of all the hash tables.
 *
 * @param dic	the dictionary
 * @return	the total size
 */
FRISO_API uint_t friso_all_dic_size( 
		friso_dic_t dic ) 
{
	register uint_t total = 0, idx = 0;

	while ( idx < __FRISO_LEXICON_LENGTH__ ) {
		total += hash_get_size( dic[idx] );
		idx++;
	}

	return total;
}
